bitkeeper revision 1.1159.74.1 (413f431aIw1lNSoAl0H63fnzXoSZ8Q)
author: kaf24@freefall.cl.cam.ac.uk <kaf24@freefall.cl.cam.ac.uk>
Wed, 8 Sep 2004 17:36:26 +0000 (17:36 +0000)
committer: kaf24@freefall.cl.cam.ac.uk <kaf24@freefall.cl.cam.ac.uk>
Wed, 8 Sep 2004 17:36:26 +0000 (17:36 +0000)
Fix TLB flushing on page-type changes. In particular, page-table pages
must trigger a flush when their type changes. To simplify the code we also
flush for LDT/GDT pages, but they will change type very infrequently.

xen/arch/x86/domain.c
xen/arch/x86/memory.c
xen/include/asm-x86/mm.h

index 8bbe2968b28310f611baa67daf2aacdf01b6856f..f16597ad73cc1fc29808eb4f7a8a9ae53229579b 100644 (file)
@@ -733,7 +733,6 @@ int construct_dom0(struct domain *p,
         *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);
         
         page = &frame_table[mfn];
-        set_bit(_PGC_tlb_flush_on_type_change, &page->count_info);
         if ( !get_page_and_type(page, p, PGT_writable_page) )
             BUG();
 
index 8ca053397388125663e64e4814a3149eb428601d..a41b76065f7360263469df1b453433244889c00f 100644 (file)
@@ -462,7 +462,6 @@ get_page_from_l1e(
     {
         if ( unlikely(!get_page_type(page, PGT_writable_page)) )
             return 0;
-        set_bit(_PGC_tlb_flush_on_type_change, &page->count_info);
     }
 
     return 1;
@@ -774,18 +773,6 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
 
 int alloc_page_type(struct pfn_info *page, unsigned int type)
 {
-    if ( unlikely(test_and_clear_bit(_PGC_tlb_flush_on_type_change, 
-                                     &page->count_info)) )
-    {
-        struct domain *p = page->u.inuse.domain;
-        if ( unlikely(NEED_FLUSH(tlbflush_time[p->processor],
-                                 page->tlbflush_timestamp)) )
-        {
-            perfc_incr(need_flush_tlb_flush);
-            flush_tlb_cpu(p->processor);
-        }
-    }
-
     switch ( type )
     {
     case PGT_l1_page_table:
@@ -833,6 +820,151 @@ void free_page_type(struct pfn_info *page, unsigned int type)
 }
 
 
+void put_page_type(struct pfn_info *page)
+{
+    u32 nx, x, y = page->u.inuse.type_info;
+
+ again:
+    do {
+        x  = y;
+        nx = x - 1;
+
+        ASSERT((x & PGT_count_mask) != 0);
+        ASSERT(x & PGT_validated);
+
+        if ( unlikely((nx & PGT_count_mask) == 0) )
+        {
+            /* Record TLB information for flush later. Races are harmless. */
+            page->tlbflush_timestamp = tlbflush_clock;
+            
+            if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) )
+            {
+                /*
+                 * Page-table pages must be unvalidated when count is zero. The
+                 * 'free' is safe because the refcnt is non-zero and validated
+                 * bit is clear => other ops will spin or fail.
+                 */
+                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 
+                                           x & ~PGT_validated)) != x) )
+                    goto again;
+                /* We cleared the 'valid bit' so we do the clear up. */
+                free_page_type(page, x & PGT_type_mask);
+                /* Carry on, but with the 'valid bit' now clear. */
+                x  &= ~PGT_validated;
+                nx &= ~PGT_validated;
+            }
+        }
+       else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == 
+                           (PGT_pinned | 1)) )
+       {
+            /* Page is now only pinned. Make the back pointer mutable again. */
+           nx |= PGT_va_mutable;
+       }
+    }
+    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
+}
+
+
+int get_page_type(struct pfn_info *page, u32 type)
+{
+    u32 nx, x, y = page->u.inuse.type_info;
+
+ again:
+    do {
+        x  = y;
+        nx = x + 1;
+        if ( unlikely((nx & PGT_count_mask) == 0) )
+        {
+            MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
+            return 0;
+        }
+        else if ( unlikely((x & PGT_count_mask) == 0) )
+        {
+            if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
+            {
+                /*
+                 * On type change we check to flush stale TLB entries. This 
+                 * may be unnecessary (e.g., page was GDT/LDT) but those
+                 * circumstances should be very rare.
+                 */
+                struct domain *d = page->u.inuse.domain;
+                if ( unlikely(NEED_FLUSH(tlbflush_time[d->processor],
+                                         page->tlbflush_timestamp)) )
+                {
+                    perfc_incr(need_flush_tlb_flush);
+                    flush_tlb_cpu(d->processor);
+                }
+
+                /* We lose existing type, back pointer, and validity. */
+                nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
+                nx |= type;
+
+                /* No special validation needed for writable pages. */
+                /* Page tables and GDT/LDT need to be scanned for validity. */
+                if ( type == PGT_writable_page )
+                    nx |= PGT_validated;
+            }
+        }
+        else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
+        {
+            if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
+            {
+                if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
+                     ((type & PGT_type_mask) != PGT_l1_page_table) )
+                    MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
+                            x & PGT_type_mask, type, page_to_pfn(page));
+                return 0;
+            }
+            else if ( (x & PGT_va_mask) == PGT_va_mutable )
+            {
+                /* The va backpointer is mutable, hence we update it. */
+                nx &= ~PGT_va_mask;
+                nx |= type; /* we know the actual type is correct */
+            }
+            else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
+            {
+                /* The va backpointer wasn't mutable, and is different. */
+                MEM_LOG("Unexpected va backpointer (saw %08x != exp %08x)"
+                        " for pfn %08lx\n", x, type, page_to_pfn(page));
+                return 0;
+            }
+        }
+       else if ( unlikely(!(x & PGT_validated)) )
+        {
+            /* Someone else is updating validation of this page. Wait... */
+            while ( (y = page->u.inuse.type_info) == x )
+            {
+                rep_nop();
+                barrier();
+            }
+            goto again;
+        }
+    }
+    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
+
+    if ( unlikely(!(nx & PGT_validated)) )
+    {
+        /* Try to validate page type; drop the new reference on failure. */
+        if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
+        {
+            MEM_LOG("Error while validating pfn %08lx for type %08x."
+                    " caf=%08x taf=%08x\n",
+                    page_to_pfn(page), type,
+                   page->count_info,
+                   page->u.inuse.type_info);
+            /* Noone else can get a reference. We hold the only ref. */
+            page->u.inuse.type_info = 0;
+            return 0;
+        }
+
+        /* Noone else is updating simultaneously. */
+        __set_bit(_PGT_validated, &page->u.inuse.type_info);
+    }
+
+    return 1;
+}
+
+
 static int do_extended_command(unsigned long ptr, unsigned long val)
 {
     int okay = 1, cpu = smp_processor_id();
@@ -1747,7 +1879,6 @@ int ptwr_do_page_fault(unsigned long addr)
 #ifndef NDEBUG
 void ptwr_status(void)
 {
-    int i;
     unsigned long pte, pfn;
     struct pfn_info *page;
     l2_pgentry_t *pl2e;
index 18e70240d19e281e7290c3c00c5a3a2631812a83..50d598832951c09d1cb32de804c695d635177ff4 100644 (file)
@@ -81,17 +81,14 @@ struct pfn_info
  /* 17-bit count of uses of this frame as its current type. */
 #define PGT_count_mask      ((1<<17)-1)
 
- /* For safety, force a TLB flush when this page's type changes. */
-#define _PGC_tlb_flush_on_type_change 31
-#define PGC_tlb_flush_on_type_change  (1<<_PGC_tlb_flush_on_type_change)
  /* Cleared when the owning guest 'frees' this page. */
-#define _PGC_allocated                30
+#define _PGC_allocated                31
 #define PGC_allocated                 (1<<_PGC_allocated)
  /* This bit is always set, guaranteeing that the count word is never zero. */
-#define _PGC_always_set               29
+#define _PGC_always_set               30
 #define PGC_always_set                (1<<_PGC_always_set)
- /* 29-bit count of references to this frame. */
-#define PGC_count_mask                ((1<<29)-1)
+ /* 30-bit count of references to this frame. */
+#define PGC_count_mask                ((1<<30)-1)
 
 /* We trust the slab allocator in slab.c, and our use of it. */
 #define PageSlab(page)         (1)
@@ -104,7 +101,7 @@ struct pfn_info
     do {                                                                    \
         (_pfn)->u.inuse.domain = (_dom);                                    \
         /* The incremented type count is intended to pin to 'writable'. */  \
-        (_pfn)->u.inuse.type_info  = PGT_writable_page | PGT_validated | 1; \
+        (_pfn)->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;  \
         wmb(); /* install valid domain ptr before updating refcnt. */       \
         spin_lock(&(_dom)->page_alloc_lock);                                \
         /* _dom holds an allocation reference */                            \
@@ -143,155 +140,34 @@ static inline int get_page(struct pfn_info *page,
                            struct domain *domain)
 {
     u32 x, nx, y = page->count_info;
-    struct domain *p, *np = page->u.inuse.domain;
+    struct domain *d, *nd = page->u.inuse.domain;
 
     do {
         x  = y;
         nx = x + 1;
-        p  = np;
+        d  = nd;
         if ( unlikely((x & PGC_count_mask) == 0) ||  /* Not allocated? */
              unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
-             unlikely(p != domain) )                 /* Wrong owner? */
+             unlikely(d != domain) )                 /* Wrong owner? */
         {
             DPRINTK("Error pfn %08lx: ed=%p, sd=%p, caf=%08x, taf=%08x\n",
-                    page_to_pfn(page), domain, p,
+                    page_to_pfn(page), domain, d,
                     x, page->u.inuse.type_info);
             return 0;
         }
         __asm__ __volatile__(
             LOCK_PREFIX "cmpxchg8b %3"
-            : "=d" (np), "=a" (y), "=c" (p),
+            : "=d" (nd), "=a" (y), "=c" (d),
               "=m" (*(volatile u64 *)(&page->count_info))
-            : "0" (p), "1" (x), "c" (p), "b" (nx) );
-    }
-    while ( unlikely(np != p) || unlikely(y != x) );
-
-    return 1;
-}
-
-
-static inline void put_page_type(struct pfn_info *page)
-{
-    u32 nx, x, y = page->u.inuse.type_info;
-
- again:
-    do {
-        x  = y;
-        nx = x - 1;
-        if ( unlikely((nx & PGT_count_mask) == 0) )
-        {
-            page->tlbflush_timestamp = tlbflush_clock;
-            if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
-                 likely(nx & PGT_validated) )
-            {
-                /*
-                 * Page-table pages must be unvalidated when count is zero. The
-                 * 'free' is safe because the refcnt is non-zero and the
-                 * validated bit is clear => other ops will spin or fail.
-                 */
-                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 
-                                           x & ~PGT_validated)) != x) )
-                    goto again;
-                /* We cleared the 'valid bit' so we must do the clear up. */
-                free_page_type(page, x & PGT_type_mask);
-                /* Carry on as we were, but with the 'valid bit' now clear. */
-                x  &= ~PGT_validated;
-                nx &= ~PGT_validated;
-            }
-        }
-       else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == 
-                           (PGT_pinned | 1)) )
-       {
-            /* Page is now only pinned. Make the back pointer mutable again. */
-           nx |= PGT_va_mutable;
-       }
-    }
-    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
-}
-
-
-static inline int get_page_type(struct pfn_info *page, u32 type)
-{
-    u32 nx, x, y = page->u.inuse.type_info;
- again:
-    do {
-        x  = y;
-        nx = x + 1;
-        if ( unlikely((nx & PGT_count_mask) == 0) )
-        {
-            DPRINTK("Type count overflow on pfn %08lx\n", page_to_pfn(page));
-            return 0;
-        }
-        else if ( unlikely((x & PGT_count_mask) == 0) )
-        {
-            if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
-            {
-                nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
-                nx |= type;
-                /* No extra validation needed for writable pages. */
-                if ( type == PGT_writable_page )
-                    nx |= PGT_validated;
-            }
-        }
-        else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
-        {
-            if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
-            {
-#ifdef VERBOSE
-                if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
-                     ((type & PGT_type_mask) != PGT_l1_page_table) )
-                    DPRINTK("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
-                            x & PGT_type_mask, type, page_to_pfn(page));
-#endif
-                return 0;
-            }
-            else if ( (x & PGT_va_mask) == PGT_va_mutable )
-            {
-                /* The va backpointer is mutable, hence we update it. */
-                nx &= ~PGT_va_mask;
-                nx |= type; /* we know the actual type is correct */
-            }
-            else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
-            {
-                /* The va backpointer wasn't mutable, and is different. */
-                DPRINTK("Unexpected va backpointer (saw %08x != exp %08x)"
-                        " for pfn %08lx\n", x, type, page_to_pfn(page));
-                return 0;
-            }
-        }
-       else if ( unlikely(!(x & PGT_validated)) )
-        {
-            /* Someone else is updating validation of this page. Wait... */
-            while ( (y = page->u.inuse.type_info) == x )
-            {
-                rep_nop();
-                barrier();
-            }
-            goto again;
-        }
-    }
-    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
-
-    if ( unlikely(!(nx & PGT_validated)) )
-    {
-        /* Try to validate page type; drop the new reference on failure. */
-        if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
-        {
-            DPRINTK("Error while validating pfn %08lx for type %08x."
-                    " caf=%08x taf=%08x\n",
-                    page_to_pfn(page), type,
-                   page->count_info,
-                   page->u.inuse.type_info);
-            put_page_type(page);
-            return 0;
-        }
-
-        set_bit(_PGT_validated, &page->u.inuse.type_info);
+            : "0" (d), "1" (x), "c" (d), "b" (nx) );
     }
+    while ( unlikely(nd != d) || unlikely(y != x) );
 
     return 1;
 }
 
+void put_page_type(struct pfn_info *page);
+int  get_page_type(struct pfn_info *page, u32 type);
 
 static inline void put_page_and_type(struct pfn_info *page)
 {